#### Ponderador MatchDf
library(survey)
library(svrep)
library(tidyverse)

class(mayo.mt) <- "data.frame"
mayo.mt <- mayo.mt %>% mutate(educ_r = factor(case_when(
  educ_w01 >= 1 & educ_w01 <= 5 | educ_w01 == 99 ~ 1,
  educ_w01 == 6  ~ 2,
  educ_w01 == 7  ~ 3,
  educ_w01 == 8  ~ 4,
  educ_w01 == 9 | educ_w01 == 10 ~ 5
), labels = c("Escolar","Educacion Tecnica Superior Incompleta","Educacion Tecnica Superior Completa",
              "Profesional Incompleto","Profesional y Postgrado")),
edad_r = factor(case_when(
  edad_w01 %in% c(18:29) ~ 1,
  edad_w01 %in% c(30:39) ~ 2,
  edad_w01 %in% c(40:49) ~ 3,
  edad_w01 %in% c(50:100) ~ 4
),
labels = c("18 a 29","30 a 39","40 a 49", "50 a 65")
))

mayo.mt$educaFr_w03 <- factor(car::recode(mayo.mt$educ_w03, "1:5=1; 6:7=2; 8:10=3; 99=1"),
                         labels=c("Secondary Ed.", "Tertiary Tech.", "Tertiary Univ"))

mayo.mt$sec <- ifelse(mayo.mt$educaFr_w03 == "Secondary Ed.",1,0)
mayo.mt$ter_tec <- ifelse(mayo.mt$educaFr_w03 == "Tertiary Tech.",1,0)
mayo.mt$ter_univ <- ifelse(mayo.mt$educaFr_w03 == "Tertiary Univ",1,0)



# Porcentajes en CASEN

edad.dist <- data.frame(edad_r = c("18 a 29","30 a 39","40 a 49", "50 a 65"),
                        Freq = nrow(mayo.mt) *  c(0.29, 0.21, 0.19, 0.31))
educ.dist <- data.frame(educ_r = c("Escolar","Educacion Tecnica Superior Incompleta","Educacion Tecnica Superior Completa",
                                   "Profesional Incompleto","Profesional y Postgrado"),
                        Freq = nrow(mayo.mt) * c(0.58,0.04,0.09,0.11,0.18))


#mayo.mt.svy <- svydesign(ids = ~1, data = mayo.mt)
#mayo.mt.svy.rake <- rake(design = mayo.mt.svy, sample.margins = list(~educ_r,~edad_r),
#                         population.margins = list(educ.dist, edad.dist))

#summary(weights(mayo.mt.svy.rake))
#mayo.mt.svy.rake <- trimWeights(mayo.mt.svy.rake,
#                                lower=1/(median(weights(mayo.mt.svy.rake))+9*IQR(weights(mayo.mt.svy.rake))), 
#                                upper=median(weights(mayo.mt.svy.rake))+9*IQR(weights(mayo.mt.svy.rake)))


#summary(weights(mayo.mt.svy.rake))
#prop.table(svytable(~educaFr_w03,design = mayo.mt.svy.rake))*100
#prop.table(table(mayo.mt.svy.rake$variables["educaFr_w03"]))*100

k <- NULL
l <- list()

for (k in 2:20){
  # The weight is created
  
  mayo.mt.2.svy <- svydesign(ids = ~1, data = mayo.mt)
  mayo.mt.2.svy.rake <- rake(design = mayo.mt.2.svy, sample.margins = list(~educ_r,~edad_r),
                             population.margins = list(educ.dist, edad.dist))
  
  # A df with the values of education is created
  
  df <- as.data.frame(prop.table(table(mayo.mt.2.svy.rake$variables["educaFr_w03"])))
  colnames(df)[2] <- "Unweighted Mean"
  df <- merge(df,as.data.frame(prop.table(svytable(~educaFr_w03,design=mayo.mt.2.svy.rake))),by="educaFr_w03")
  colnames(df)[3] <- "Pre-Trim Mean"
  df <- cbind(df,Population = c(.61,.11,.25))
  df$mediana <- median(weights(mayo.mt.2.svy.rake))
  df$iqr <- IQR(weights(mayo.mt.2.svy.rake))
  max_pre_trim <- max(weights(mayo.mt.2.svy.rake))
  
  #The weight is trimmed
  
  mayo.mt.2.svy.rake <- trimWeights(mayo.mt.2.svy.rake,
                                    lower=1/(median(weights(mayo.mt.2.svy.rake))+k*IQR(weights(mayo.mt.2.svy.rake))), 
                                    upper=median(weights(mayo.mt.2.svy.rake))+k*IQR(weights(mayo.mt.2.svy.rake)))
  
  df <- cbind(df,as.data.frame(prop.table(svytable(~educaFr_w03, design=mayo.mt.2.svy.rake)))[2])
  colnames(df)[which(colnames(df)=="Freq")] <- "Weighted Mean" # This is the trimmed weight
  
  df$Variance <- 0
  df$Variance[1] <- svyvar(~sec,design=mayo.mt.2.svy.rake,na.rm=T)
  df$Variance[2] <- svyvar(~ter_tec,design=mayo.mt.2.svy.rake,na.rm=T)
  df$Variance[3] <- svyvar(~ter_univ,design=mayo.mt.2.svy.rake,na.rm=T)
  
  df$mse <- (df$`Weighted Mean`- df$Population)^2+df$Variance
  df$max_pre_trim <- max_pre_trim
  df$max_post_trim <- max(weights(mayo.mt.2.svy.rake))
  
  # A variable that informs the relative bias and the relative variance is created (Potter & Zheng, 2015, pg.2713)
  df$rel_bias <- 100*(df$Population-df$`Weighted Mean`)/df$Population
  df$rel_var <- 100*(df$`Weighted Mean`-df$`Pre-Trim Mean`)/df$`Pre-Trim Mean`
  
  df$k <- k
  
  # df is stored in a list  
  l[[k]] <- df
}

library(data.table)

df2 <- do.call("rbind", l)
df2 <- df2 %>% arrange(educaFr_w03) %>% group_by(educaFr_w03) %>% mutate(delta_mse = mse-dplyr::lag(mse,n=1))

mse_plot21mt <- ggplot(data=filter(df2,educaFr_w03!="Tertiary Tech."),aes(k, mse, color=educaFr_w03)) + 
  geom_line() + geom_point() + labs(x="K",y="MSE",color="Educational Level",
                                    title="MSE Reduction by each additional K") + theme_bw()

delta_plot21mt <- ggplot(data=filter(df2,educaFr_w03!="Tertiary Tech."), aes(k, delta_mse, color=educaFr_w03)) + 
  geom_line() + geom_point() + labs(x="K",y="Delta MSE",color="Educational Level",
                                    title="Delta MSE Reduction by each additional K") + theme_bw()

#Weight trimming

#Antes el k = 6

k <- 9

mayo.mt.2.svy <- svydesign(ids = ~1, data = mayo.mt)
mayo.mt.2.svy.rake <- rake(design = mayo.mt.2.svy, sample.margins = list(~educ_r,~edad_r),
                           population.margins = list(educ.dist,edad.dist))


max(weights(mayo.mt.2.svy.rake))

mayo.mt.2.svy.rake <- trimWeights(mayo.mt.2.svy.rake, lower=1/(median(weights(mayo.mt.2.svy.rake))+k*IQR(weights(mayo.mt.2.svy.rake))), 
                                  upper=median(weights(mayo.mt.2.svy.rake))+k*IQR(weights(mayo.mt.2.svy.rake)))

prop.table(svytable(~educaFr_w03, design=mayo.mt.2.svy.rake))


mayo.mt.w <- as_data_frame_with_weights(mayo.mt.2.svy.rake, full_wgt_name = "weight.match.educ.edad",
                                    rep_wgt_prefix = "REP_WGT_")

summary(mayo.mt.w$weight.match.educ.edad)
summary(mayo.mt.w$weight.match.educ)
#prop.table(wtd.table(mayo.mt.w$educaF_w03, weight=mayo.mt.w$weight.match.educ.edad))
